{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Directory holding the Amazon CDs & Vinyl files. It is concatenated directly with\n",
    "# file names further down, so the trailing slash is required.\n",
    "dir_path = './Amazon_Review/CDs/'\n",
    "# Ratings CSV, read by read_user_rating_records() as (user_id, item_id, rating, timestamp).\n",
    "rating_file = 'ratings_CDs_and_Vinyl.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>item_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AHG1GTQZUYNJN</td>\n",
       "      <td>0001393774</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1372723200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>A1WX42M589VAMQ</td>\n",
       "      <td>0001393774</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1167350400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>A2UA9KKUQCTEIN</td>\n",
       "      <td>0001501348</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1381017600</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           user_id     item_id  rating   timestamp\n",
       "1    AHG1GTQZUYNJN  0001393774     5.0  1372723200\n",
       "10  A1WX42M589VAMQ  0001393774     5.0  1167350400\n",
       "20  A2UA9KKUQCTEIN  0001501348     5.0  1381017600"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def read_user_rating_records():\n",
    "    \"\"\"Load the raw rating log into a DataFrame.\n",
    "\n",
    "    The file at ``dir_path + rating_file`` is parsed as a header-less,\n",
    "    comma-separated table whose columns are named user_id, item_id, rating\n",
    "    and timestamp (in that order).\n",
    "    \"\"\"\n",
    "    csv_path = dir_path + rating_file\n",
    "    return pd.read_csv(\n",
    "        csv_path,\n",
    "        sep=',',\n",
    "        names=['user_id', 'item_id', 'rating', 'timestamp'],\n",
    "        engine='python',\n",
    "    )\n",
    "\n",
    "data_records = read_user_rating_records()\n",
    "# Spot-check a few rows. Only the last expression of a cell is rendered, so a\n",
    "# preview placed before this line would never be shown.\n",
    "data_records.iloc[[1, 10, 20]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1578597 486360\n"
     ]
    }
   ],
   "source": [
    "# Number of distinct users and distinct items in the raw rating log.\n",
    "n_users = data_records['user_id'].nunique()\n",
    "n_items = data_records['item_id'].nunique()\n",
    "print(n_users, n_items)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1425149 457623\n"
     ]
    }
   ],
   "source": [
    "# Binarise the explicit ratings: below 4 -> 0, 4 and above -> 1.\n",
    "# NOTE: this overwrites the rating column, so re-running the cell without\n",
    "# reloading data_records first leaves no rows (every rating is then 1, i.e. < 4).\n",
    "low_rating = data_records.rating < 4\n",
    "data_records.loc[low_rating, 'rating'] = 0\n",
    "high_rating = data_records.rating >= 4\n",
    "data_records.loc[high_rating, 'rating'] = 1\n",
    "\n",
    "# Keep only the positive interactions.\n",
    "data_records = data_records[data_records.rating > 0]\n",
    "\n",
    "n_users = len(data_records['user_id'].unique())\n",
    "n_items = len(data_records['item_id'].unique())\n",
    "print(n_users, n_items)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "users with < 5 interactoins are removed\n",
      "items with < 8 interactoins are removed\n",
      "num of users:88016, num of items:35287\n"
     ]
    }
   ],
   "source": [
    "from copy import deepcopy\n",
    "def remove_infrequent_items(data, min_counts=5):\n",
    "    df = deepcopy(data)\n",
    "    counts = df['item_id'].value_counts()\n",
    "    df = df[df[\"item_id\"].isin(counts[counts >= min_counts].index)]\n",
    "\n",
    "    print(\"items with < {} interactoins are removed\".format(min_counts))\n",
    "    # print(df.describe())\n",
    "    return df\n",
    "\n",
    "def remove_infrequent_users(data, min_counts=10):\n",
    "    df = deepcopy(data)\n",
    "    counts = df['user_id'].value_counts()\n",
    "    df = df[df[\"user_id\"].isin(counts[counts >= min_counts].index)]\n",
    "\n",
    "    print(\"users with < {} interactoins are removed\".format(min_counts))\n",
    "    # print(df.describe())\n",
    "    return df\n",
    "# Apply the user filter first, then the item filter on whatever is left.\n",
    "active_user_data = remove_infrequent_users(data_records, 5)\n",
    "filtered_data = remove_infrequent_items(active_user_data, 8)\n",
    "\n",
    "n_users = len(filtered_data['user_id'].unique())\n",
    "n_items = len(filtered_data['item_id'].unique())\n",
    "print('num of users:{}, num of items:{}'.format(n_users, n_items))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "item_id\n",
      "B00004TAH2    8\n",
      "B0009F66PY    8\n",
      "B0009EZ0Q6    8\n",
      "B004ZLBTNW    8\n",
      "B00001T3H5    8\n",
      "B004ZLBU6I    8\n",
      "B0009ESYAK    8\n",
      "B001RXB4PO    8\n",
      "B00001U035    8\n",
      "B000003Z9K    8\n",
      "dtype: int64\n",
      "user_id\n",
      "ADIHIVQWXQOZ5     1\n",
      "A2SXWBOWFJTG6Y    1\n",
      "A62B55P1JZD5O     1\n",
      "A2SXGE7QAO5T7M    1\n",
      "A1V29PLRGPVLH2    1\n",
      "A1V2CR670P3D12    1\n",
      "ALUL6JHG0DUH      1\n",
      "A1V2R3MO1ZVJE2    1\n",
      "A1V2SWRG6B2TDE    1\n",
      "AUZAX88EP2RD5     1\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Show the ten smallest interaction counts per item, then per user.\n",
    "for key in ('item_id', 'user_id'):\n",
    "    group_sizes = filtered_data.groupby(key).size()\n",
    "    print(group_sizes.sort_values(ascending=True)[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['0738900370' '0738900672' '0738919039' '0738920363' '0760135002'\n",
      " '0767804341' '0767816641' '076783822X' '0769716903' '0769720994']\n"
     ]
    }
   ],
   "source": [
    "# Read one review text per item that is still present in filtered_data.\n",
    "item_list = filtered_data['item_id'].unique()\n",
    "# Set copy of the ids, used for membership tests while scanning the review file.\n",
    "item_set = set(item_list)\n",
    "\n",
    "print(item_list[:10])\n",
    "\n",
    "# Gzip file that parse() below reads line by line, decoding each line as JSON.\n",
    "review_file = 'reviews_CDs_and_Vinyl_5.json.gz'\n",
    "\n",
    "import json\n",
    "import gzip\n",
    "\n",
    "def parse(path):\n",
    "    g = gzip.open(path, 'r')\n",
    "    for l in g:\n",
    "        yield json.loads(l)\n",
    "        # yield json.dumps(eval(l))\n",
    "\n",
    "# Per item (keyed by asin): the selected review text and the score it was selected with.\n",
    "# score = first 'helpful' entry / (second 'helpful' entry + 0.01); the 0.01 keeps the\n",
    "# denominator non-zero when the second entry is 0.\n",
    "review_dict = dict()\n",
    "review_helpful = dict()\n",
    "for entry in parse(dir_path + review_file):\n",
    "    asin = entry['asin']\n",
    "    if asin not in item_set:\n",
    "        continue\n",
    "    if asin in review_dict:\n",
    "        # Replace the stored review only by a strictly higher-scoring one that is long enough.\n",
    "        votes = entry['helpful']\n",
    "        score = votes[0] / float(votes[1] + 0.01)\n",
    "        if not (score > review_helpful[asin] and len(entry['reviewText']) > 10):\n",
    "            continue\n",
    "    else:\n",
    "        # First candidate for this item: accept it as soon as the text is long enough.\n",
    "        if not len(entry['reviewText']) > 10:\n",
    "            continue\n",
    "        votes = entry['helpful']\n",
    "        score = votes[0] / float(votes[1] + 0.01)\n",
    "    review_dict[asin] = entry['reviewText']\n",
    "    review_helpful[asin] = score\n",
    "\n",
    "# print review_dict['1300966947']\n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['1591796423', '6303439330', '6305712085', 'B000001V37', 'B000001ZWU', 'B0000024F7', 'B0000024UD', 'B000002C45', 'B000002IIN', 'B000002NUD', 'B000002O44', 'B000002SEL', 'B000003YFA', 'B0000046K4', 'B000005KWU', 'B000005L8T', 'B000005NMW', 'B000005TN9', 'B000008M3P', 'B000008UPI', 'B00000G24T', 'B00000ICH9', 'B00001X59W', 'B000028BCJ', 'B00003CK5M', 'B00003G1KA', 'B00004HYKL', 'B00004SR1J', 'B00004TW91', 'B00004WF6M', 'B00005MMN9', 'B0000648Y6', 'B000078JK0', 'B0000AQJOV', 'B0000BWVA3', 'B0000DBJAS', 'B0001IXTU4', 'B0001L7RZY', 'B0002LMLSQ', 'B0002LO7GU', 'B0002PUH1U', 'B00061NLEK', 'B00065U04A', 'B00065VSKU', 'B0008ENI02', 'B0009VNCG4', 'B000B9EYFM', 'B000BH4Z08', 'B000CAKQ0M', 'B000F2CBZU', 'B000F6YW5S', 'B000I2J7M2', 'B000JJSJBE', 'B000LC4ZNA', 'B000P46S3E', 'B000PTYUR0', 'B000Q35SAI', 'B000R0ML28', 'B000SGHSLU', 'B000VJE1VY', 'B000VKL6UW', 'B000VSYHL4', 'B000WZ8RVC', 'B000X1L852', 'B000XC906K', 'B000XVT7SM', 'B001AIG41K', 'B001BFZ0RG', 'B001BSH10M', 'B001CIOCRQ', 'B001QFNSC0', 'B001QWFUCY', 'B0025XV1LW', 'B002F3BOXC', 'B002LZUKUC', 'B002NVTBOU', 'B002R0HT28', 'B0032Y8XKA', 'B0035LURDG', 'B0039TD7WC', 'B003AHWF58', 'B003CKMNAA', 'B0044WMPZI', 'B004ODO77G', 'B004X6J3DM', 'B004YOECTI', 'B005I735LG', 'B006X0K1ZA', 'B007JSXQIO', 'B007KLY8FA', 'B007RBKZRI', 'B008NO1PUE', 'B009CYYJIE', 'B00AV4DXEA', 'B00B5UBGTS', 'B00D1QBQ2G', 'B00DSAUMXO']\n",
      "35190\n"
     ]
    }
   ],
   "source": [
    "# Delete items without reviews.\n",
    "item_without_review = [item_id for item_id in item_list if item_id not in review_dict]\n",
    "print(item_without_review)\n",
    "\n",
    "# One vectorised mask instead of re-filtering the whole frame once per missing item.\n",
    "filtered_data = filtered_data[~filtered_data['item_id'].isin(item_without_review)]\n",
    "\n",
    "item_list = filtered_data['item_id'].unique()\n",
    "print(len(item_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Sanity check: list any item whose stored review is shorter than 5 characters.\n",
    "short_review_items = [item_id for item_id, review in review_dict.items() if len(review) < 5]\n",
    "for item_id in short_review_items:\n",
    "    print(item_id)\n",
    "# print review_dict['B002IUAUI2']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "succressfully created sequencial data! head: user_id\n",
      "A010397922UKJ9QFDYFIE    [B000VH1AZQ, B000CC1TGA, B00BEJ5O6Q, B009A87WM...\n",
      "A01981332V9QR1ZDG2WLS                 [B0043C3DZU, B00007E8J1, B00004WIR6]\n",
      "A0200189274BR0E83NMS9                             [157252409X, B000CBVMLS]\n",
      "A02039013W06XH9FVVFUZ                 [B000008UGB, B00004TRUP, B000002H25]\n",
      "A02852361P0OLWYC4FW7X     [B000008FVT, B000007NZV, B000053GU0, B00004TL26]\n",
      "Name: item_id, dtype: object\n",
      "user_id\n",
      "A08161909WK3HU7UYTMW     [B00005OMGE, B0041WLBEC, B00005YWFF, B000069JJ...\n",
      "A099280716ZEH5UPWAN4A    [B000T2PRJI, B004GZSWO6, B0000C23DW, B0000025O...\n",
      "A1006V961PBMKA           [B000065BW8, B000003TAW, B00006V9A0, B00008J4P...\n",
      "A100JCBNALJFAW           [B0001NBMBC, B00000099X, B0007WF1X2, B0007Y4TV...\n",
      "A100TF7VLG8RBV           [B000001FXX, B00002MYYI, B000007OG6, B00001QGP...\n",
      "A10127132IE1A73IN1HGO    [B004T4YPI6, B0052V0NQ8, B005EIHMU4, B009LXEBD...\n",
      "A1012N48J0Z65N           [B000002KFJ, B00003TFVK, B000002U8G, B0000033P...\n",
      "A101L4HF0IZ33C           [B00000DHTF, B008LZHA3G, B00004Y9YB, B000059T9...\n",
      "A101QL3M0GPBYG           [B000000W6N, B0000CG89W, B000FDFOX0, 630237172...\n",
      "A1020L7BWW9RAX           [B000075A20, B00006SM81, B00005LB1H, B00005ABN...\n",
      "Name: item_id, dtype: object\n",
      "17052\n",
      "<class 'pandas.core.series.Series'>\n"
     ]
    }
   ],
   "source": [
    "# convert records to sequential data per user\n",
    "def convert_data(data):\n",
    "    # for each user, sort by timestamps\n",
    "    df = deepcopy(data)\n",
    "    df_ordered = df.sort_values(['timestamp'], ascending=True)\n",
    "    data = df_ordered.groupby('user_id')['item_id'].apply(list)\n",
    "    #print(data)\n",
    "    #time_l = df_ordered.groupby('user')['checkin_time'].apply(list)\n",
    "    #print(time_l)\n",
    "    print(\"succressfully created sequencial data! head:\", data.head(5))\n",
    "    unique_data = df_ordered.groupby('user_id')['item_id'].nunique()\n",
    "    data = data[unique_data[unique_data >= 10].index]\n",
    "    print(data[:10])\n",
    "    print(len(data))\n",
    "    return data\n",
    "\n",
    "# Build the per-user item sequences from the filtered interactions.\n",
    "seq_data = convert_data(filtered_data)\n",
    "print(type(seq_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17052 35118\n"
     ]
    }
   ],
   "source": [
    "user_item_dict = seq_data.to_dict()\n",
    "\n",
    "# Inner user index i corresponds to user_mapping[i] (same order as seq_data).\n",
    "user_mapping = list(seq_data.index)\n",
    "\n",
    "# Collect every item that appears in at least one kept sequence.\n",
    "item_set = set()\n",
    "for user_items in seq_data:\n",
    "    item_set.update(user_items)\n",
    "# Sort so that the item indices are identical on every run; the iteration order of a\n",
    "# set of strings is not guaranteed to be the same between interpreter sessions.\n",
    "item_mapping = sorted(item_set)\n",
    "\n",
    "print(len(user_mapping), len(item_mapping))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[19424, 1824, 4592, 3047, 16232, 28137, 30957, 15135, 5251, 9053, 31521, 28819, 466, 3289, 9602, 14408, 18384, 6926, 16823, 26584, 19449, 4850, 27287, 5117, 4694, 8868, 10830, 17932, 15224, 18093, 34884, 26394, 15488], [16607, 100, 23666, 9859, 6568, 31416, 15922, 31905, 21970, 4694, 16572, 3911], [8356, 25134, 3234, 28418, 25791, 16496, 20149, 12591, 17475, 19735, 12121, 32492, 21821, 6903], [18275, 9043, 6102, 9672, 27869, 1142, 10394, 34153, 7212, 24662, 14752, 14589], [30596, 4267, 28556, 17301, 21730, 10942, 29486, 25925, 25134, 13980, 13965, 14331, 27004, 2408, 4529]]\n"
     ]
    }
   ],
   "source": [
    "def generate_inverse_mapping(data_list):\n",
    "    inverse_mapping = dict()\n",
    "    for inner_id, true_id in enumerate(data_list):\n",
    "        inverse_mapping[true_id] = inner_id\n",
    "    return inverse_mapping\n",
    "\n",
    "def convert_to_inner_index(user_records, user_mapping, item_mapping):\n",
    "    \"\"\"Re-encode every user's item sequence with list-position ids.\n",
    "\n",
    "    Args:\n",
    "        user_records: mapping from original user id to that user's sequence of original item ids.\n",
    "        user_mapping: list of original user ids; the position in the list is the inner user id.\n",
    "        item_mapping: list of original item ids; the position in the list is the inner item id.\n",
    "\n",
    "    Returns:\n",
    "        (inner_user_records, user_inverse_mapping, item_inverse_mapping), where\n",
    "        inner_user_records[i] is the inner-id sequence of user_mapping[i] and the two\n",
    "        dicts map original ids to inner ids.\n",
    "    \"\"\"\n",
    "    user_inverse_mapping = generate_inverse_mapping(user_mapping)\n",
    "    item_inverse_mapping = generate_inverse_mapping(item_mapping)\n",
    "\n",
    "    inner_user_records = [\n",
    "        [item_inverse_mapping[real_item_id] for real_item_id in user_records[real_user_id]]\n",
    "        for real_user_id in user_mapping\n",
    "    ]\n",
    "\n",
    "    return inner_user_records, user_inverse_mapping, item_inverse_mapping\n",
    "\n",
    "# Encode all sequences with inner ids and keep both original-id -> inner-id lookups.\n",
    "inner_data_records, user_inverse_mapping, item_inverse_mapping = convert_to_inner_index(user_item_dict, user_mapping, item_mapping)\n",
    "# Preview the encoded sequences of the first five users.\n",
    "print(inner_data_records[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "def save_obj(obj, name ):\n",
    "    with open(name + '.pkl', 'wb') as f:\n",
    "        pickle.dump(obj, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Persist the index-encoded sequences together with both id lookup lists.\n",
    "for obj, file_stem in (\n",
    "    (inner_data_records, 'CDs_item_sequences'),\n",
    "    (user_mapping, 'CDs_user_mapping'),\n",
    "    (item_mapping, 'CDs_item_mapping'),\n",
    "):\n",
    "    save_obj(obj, file_stem)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
